package cn.ihoway.analysis;

import cn.ihoway.util.hStringUtils;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Pattern;

/**
 * Splits a document into sentences and collects, for every word reported by
 * {@code SentAnalyzer}, the positions at which it occurs in the document.
 */
public class DocAnalyzer {

    /** Map key carrying a word total (per sentence from the analyzer, per document in the result). */
    private static final String COUNT_KEY = "$cnt";

    /**
     * Matches a single separator: any code point that is not an ASCII letter, an ASCII
     * digit, a literal '+', or a CJK ideograph in the range U+4E00..U+9FA5.
     * Compiled once because the pattern never changes.
     */
    private static final Pattern SEPARATOR = Pattern.compile("[^a-zA-Z+0-9\u4E00-\u9FA5]");

    /**
     * Segments a document and aggregates the per-sentence analysis results.
     *
     * <p>The document is first normalised with {@code hStringUtils.ToDBC}, then cut at every
     * separator character; each resulting sentence is handed to {@code SentAnalyzer} together
     * with its start offset in the normalised document. The position strings themselves are
     * produced by {@code SentAnalyzer} and are passed through unchanged
     * (e.g. {@code 处理 -> [4:6, 67:69, 152:154]}).
     *
     * @param doc document content; {@code null} is treated as a document without any words
     * @return a map from word to its list of positions, plus the special key {@code "$cnt"}
     *         whose single-element list holds the total number of words in the document
     *         (e.g. {@code $cnt -> [97]})
     */
    public HashMap<String, ArrayList<String>> docSeg(String doc){
        HashMap<String, ArrayList<String>> result = new HashMap<>();
        int cnt = 0; // total number of words in the document

        if (doc != null) {
            // Normalise the text before splitting, so offsets refer to the normalised text.
            doc = hStringUtils.ToDBC(doc);
            String[] sentences = SEPARATOR.split(doc);
            SentAnalyzer sentAnalyzer = new SentAnalyzer();
            int index = 0; // start offset (in chars) of the current sentence within doc

            for (String sentence : sentences) {
                HashMap<String, ArrayList<String>> res = sentAnalyzer.sentAnalyze(sentence, index);

                // A missing or empty count entry contributes nothing to the total.
                ArrayList<String> cntArray = res.get(COUNT_KEY);
                if (cntArray != null && !cntArray.isEmpty()) {
                    cnt += Integer.parseInt(cntArray.get(0));
                }

                for (Map.Entry<String, ArrayList<String>> entry : res.entrySet()) {
                    String key = entry.getKey();
                    if (COUNT_KEY.equals(key)) {
                        // Per-sentence counts are summed above; the total is stored once below.
                        continue;
                    }
                    ArrayList<String> merged = result.get(key);
                    if (merged == null) {
                        // Own the list so later merges never mutate the analyzer's collection.
                        merged = new ArrayList<>();
                        result.put(key, merged);
                    }
                    merged.addAll(entry.getValue());
                }

                // Step over the sentence and the separator that ended it. A separator is one
                // code point, which occupies two chars when it is a surrogate pair.
                index += sentence.length();
                if (index < doc.length()) {
                    index += Character.charCount(doc.codePointAt(index));
                }
            }
        }

        ArrayList<String> totalArray = new ArrayList<>();
        totalArray.add(String.valueOf(cnt));
        result.put(COUNT_KEY, totalArray);
        return result;
    }


}
